In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.metrics import mean_squared_error
%matplotlib inline
In [2]:
# Load the Wage dataset from the local data directory and preview the
# first five rows.
wage_df = pd.read_csv(filepath_or_buffer='data/Wage.csv')
wage_df.head(n=5)
Out[2]:
In [4]:
from sklearn.preprocessing import PolynomialFeatures
In [21]:
# Build the regression inputs: `age` expanded into polynomial terms up to
# degree 4, with `wage` as the response.
poly = PolynomialFeatures(degree=4)
X = wage_df[['age']].values  # double brackets keep this a 2-D, single-column array
X_degree4 = poly.fit_transform(X)
y = wage_df['wage']
In [23]:
# Fit ordinary least squares of wage on the degree-4 polynomial terms of age,
# then display the regression summary.
ols = sm.OLS(endog=y, exog=X_degree4).fit()
ols.summary()
Out[23]:
In [41]:
# Plot the observations together with the fitted degree-4 polynomial curve.
fig, ax = plt.subplots()
ax.scatter(x=X[:, 0], y=y)  # X is a single-column 2-D array; plot its only column
ax.set_xlabel('age')
ax.set_ylabel('wage')
# Evaluate the fit at every integer age in the observed range. The `+ 1`
# keeps the maximum age, which a bare range(min, max) would leave out.
xs = np.arange(int(np.min(X[:, 0])), int(np.max(X[:, 0])) + 1)
# Reuse the already-fitted transformer via `transform` on a 2-D column:
# scikit-learn rejects scalar inputs, and re-fitting per point is unnecessary.
xs_degree4 = poly.transform(xs.reshape(-1, 1))
# Predict the whole grid in one vectorised call.
ys = ols.predict(xs_degree4)
ax.plot(xs, ys, 'r', linewidth=2.5)
Out[41]:
In [49]:
from scipy.interpolate import LSQUnivariateSpline
In [52]:
# Fit a least-squares spline of wage on age with fixed interior knots.
x = wage_df['age'].values
y = wage_df['wage'].values
# The spline fit needs x in increasing order. Sort both arrays with ONE shared
# permutation so every wage stays paired with its own age; sorting x first and
# then zipping the sorted x with the unsorted y would scramble the pairs.
# lexsort uses the last key as the primary key: order by age, ties by wage.
order = np.lexsort((y, x))
x = x[order]
y = y[order]
# Interior knots of the spline, in the same units as `age`.
t = [20, 40, 60]
sql = LSQUnivariateSpline(x, y, t)
In [53]:
# Scatter the observations and overlay the fitted spline in green.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('age')
ax.set_ylabel('wage')
# Evenly spaced evaluation grid between the first and last entries of x.
xs = np.linspace(x[0], x[-1])
spline_values = sql(xs)
ax.plot(xs, spline_values, 'g-', linewidth=3)
Out[53]:
In [ ]: